library(tidyverse)

# Figure S1: Essay content and style correlation figure
################################################################################

# Load the merged dataset and split off the two feature blocks by column
# position.
# NOTE(review): the positions are hard-coded; presumably columns 6-75 hold the
# 70 topic features and columns 76-167 the 92 dictionary features -- confirm
# against the header of merged_final.csv.
merged_df <- read.csv(file = "merged_final.csv")
topic_df <- merged_df[, 6:75]
dict_df <- merged_df[, 76:167]

# Human-readable labels for the topic columns. They are applied positionally:
# label i becomes the name of column i of topic_df, and these names are what
# the correlation plot prints as row labels.
merged_topic_labs <- c(
  "Winning_Competitions", "Math", "AP_Classes", "Work_And_Goals",
  "Camping_Swimming", "Social_Anxiety", "Gendered_Activities",
  "Fashion_Style", "Family_Members", "Medical_Experiences",
  "Helping_Others", "Despite_Words", "Latinx_Family_Issues",
  "Education_Opportunity", "Classroom_Experiences",
  "Youth_Volunteering", "Reading_Writing", "Making_Planning",
  "Visual_Art", "Travel", "Leadership_Skills", "Seeking_Answers",
  "Mental_Health", "Outside_School_Programs", "Volunteer_Cleaning",
  "Work_Experiences", "Family_Death", "Motivations_Goals",
  "Psychology_Understanding", "Group_Leadership", "Sports_Experiences",
  "World_Histories", "China", "Language_Experiences", "Cooking",
  "Civic_Experiences", "Time_Management", "Sensory_Experiences",
  "Sociocultural_Diversity", "Business_Economics", "Performance_Art",
  "Computer_Science", "Photography", "School_Activities",
  "Humor_Storytelling", "Group_Assignments", "Work_Money",
  "Process_Words", "Boy_Scouts", "Video_Film", "Family_Church",
  "Building_Engines", "Human_Nature", "Music", "Life_Reflections",
  "Time_Cycles", "Life_Challenges", "Sensory_Responses", "HS_Years",
  "Sports_General", "School_Grades", "Dancing_Art", "Community_Service",
  "Preference_Words", "Achievement_Words", "Puzzles_Problems",
  "Chemistry_Biology", "Tutoring_Groups", "Physics",
  "New_Experiences"  # spelling corrected (was "New_Exepriences")
)

# Fail loudly if the label count and the column count ever diverge: a too-short
# name vector would otherwise leave trailing columns without a usable name.
stopifnot(length(merged_topic_labs) == ncol(topic_df))
colnames(topic_df) <- merged_topic_labs

# Diverging palette generator: col2(n) returns n colours interpolated across
# the eleven anchor colours below, running from dark red through white to
# dark blue.
col2 <- colorRampPalette(
  colors = c(
    "#67001F", "#B2182B", "#D6604D", "#F4A582", "#FDDBC7",
    "#FFFFFF",
    "#D1E5F0", "#92C5DE", "#4393C3", "#2166AC", "#053061"
  )
)

# Append the four outcome columns (family income, total SAT, and the two SAT
# section scores) from merged_df to a feature table, in a fixed order so that
# they land directly after the feature columns.
# `$` access on merged_df is kept as in the original lookup; on a data.frame it
# allows partial name matching -- NOTE(review): confirm the exact column names
# in merged_final.csv before switching to `[[`.
add_outcome_cols <- function(features) {
  features$Income <- merged_df$FAMILY_INCOME
  features$SAT <- merged_df$RSAT_TOTAL_SCORE
  features$RSAT_EBRW <- merged_df$RSAT_EBRW
  features$RSAT_MATH <- merged_df$RSAT_MATH_SCORE
  features
}

topic_df <- add_outcome_cols(topic_df)
dict_df <- add_outcome_cols(dict_df)

# Keep only rows whose reported income exceeds 10000; filter() also drops rows
# where the comparison is NA.
# NOTE(review): the 10000 cutoff is a magic number -- presumably it excludes
# implausibly low income reports; confirm.
topic_df_inc <- filter(topic_df, Income > 10000)
dict_df_inc <- filter(dict_df, Income > 10000)

# Sort a correlation table so that rows with the largest summed correlation
# across the four outcome columns come first.
order_by_total_cor <- function(cor_df) {
  total <- cor_df$RSAT_EBRW + cor_df$RSAT_MATH + cor_df$Income + cor_df$SAT
  cor_df[rev(order(total)), ]
}

# Correlate every feature column with each of the four outcome columns.
# Columns are addressed by position: features first (92 dictionary / 70 topic
# columns), then the four appended outcome columns.
# NOTE(review): cor() is called without `use=`, so any NA in the inputs yields
# NA correlations -- confirm the filtered data are complete.
dict_cor_df <- as.data.frame(
  cor(x = dict_df_inc[, 1:92], y = dict_df_inc[, 93:96])
)
topic_cor_df <- as.data.frame(
  cor(x = topic_df_inc[, 1:70], y = topic_df_inc[, 71:74])
)

dict_cor_df <- order_by_total_cor(dict_cor_df)
topic_cor_df <- order_by_total_cor(topic_cor_df)

# Topics
# Heatmap of the topic-by-outcome correlation table (rows = topics, columns =
# outcomes). The palette is passed explicitly so the cells are guaranteed to
# use the same colours as the hand-drawn legend below, instead of relying on
# corrplot's default palette.
corrplot::corrplot(as.matrix(topic_cor_df),
                   method = "color", col = col2(200),
                   tl.cex = 0.50, tl.srt = 60,
                   tl.col = "black", cl.pos = "n")

# The built-in colour bar is switched off (cl.pos = "n"); draw a custom one
# labelled from -1 to 1 at a fixed position in plot coordinates.
corrplot::colorlegend(colbar = col2(50),
                      labels = seq(-1, 1, by = 0.2),
                      xlim = c(6, 8), ylim = c(20, 52), vertical = TRUE,
                      align = "l", offset = .25)

# Panel title written along the left side (side = 2) of the plot.
mtext("Merged Essays", at = 36.5, line = -18, cex = 1.45, side = 2)

# Close the current graphics device so the next plot starts on a fresh one.
dev.off()

# Dict
# Heatmap of the dictionary-by-outcome correlation table (rows = dictionary
# features, columns = outcomes). The palette is passed explicitly so this
# panel uses the script's own col2 colours rather than corrplot's default.
# No colour bar is drawn for this panel (cl.pos = "n").
corrplot::corrplot(as.matrix(dict_cor_df),
                   method = "color", col = col2(200),
                   tl.cex = 0.50, tl.srt = 60,
                   tl.col = "black", cl.pos = "n")

# Close the current graphics device.
dev.off()

